Analysis Overview
Oakland, California is a large city in the East Bay portion of the greater San Francisco Bay region. It is bracketed on the west by San Francisco Bay and on the east by the Oakland Hills. Large portions of urbanized Oakland now occupy areas that were once within the bay margin or tidal marsh, and the hills to the east were once dominated by coast redwood forest and similar land cover. Development of the city over the past 150 years has drastically altered the landscape.
The city currently has a number of programs focused on urban greenspace, including "Green Streets and Raingardens" and "Greenspace and Carbon Removal". Portions of east Oakland adjacent to the Oakland Hills are very affluent, while the more urban parts of the city sit lower on the socioeconomic spectrum.
In this analysis, we will compare tree canopy cover, as measured by NDVI, against median income, using ordinary least squares (OLS) linear regression to look for a relationship between the two factors.
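For reference, NDVI contrasts near-infrared and red reflectance to highlight vegetation. A minimal sketch of the formula applied later to the NAIP bands (the reflectance values here are illustrative):

import numpy as np

def ndvi(nir, red):
    """Normalized Difference Vegetation Index: (NIR - Red) / (NIR + Red)."""
    nir = np.asarray(nir, dtype=float)
    red = np.asarray(red, dtype=float)
    return (nir - red) / (nir + red)

# Healthy vegetation reflects strongly in NIR, pushing NDVI toward 1
print(ndvi(0.5, 0.1))  # 0.666...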
import os
import pathlib
import census
import earthpy as et
import geopandas as gpd
import geoviews as gv
import getpass
import holoviews as hv
import hvplot.pandas
import hvplot.xarray
import numpy as np
import pandas as pd
import pystac_client
import requests
import rioxarray as rxr
import rioxarray.merge as rxrm
import shapely
import sklearn
import us
import xarray as xr
import xrspatial
from census import Census
from us import states
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
# Create the data directory if it does not already exist
data_dir = os.path.join(et.io.HOME, et.io.DATA_NAME)
os.makedirs(data_dir, exist_ok=True)
oak_bound_zip_path = os.path.join(data_dir, 'oakland_boundary.shp.zip')
# Download URL is a redirect to the zip file so using this method
# instead of et.data.get_data
oak_bound_url = ('https://data.oaklandca.gov/api/geospatial/'
'9bhq-yt6w?method=export&format=Shapefile'
)
def download_zip_from_redirect(url, save_path, file_name):
    """
    Download a zip file from a URL that redirects to the file.

    Parameters
    ==========
    url: str
        The URL to download the file from
    save_path: str
        The directory to save the file in
    file_name: str
        The name to save the file under (".zip" is appended)

    Returns
    =======
    file_path: str or None
        The path of the downloaded file, or None if the download failed.
    """
try:
# Follow redirects and get the final URL
final_url = requests.head(url, allow_redirects=True).url
# Download the ZIP file from the final URL
response = requests.get(final_url, stream=True)
response.raise_for_status()
# Specify the path where you want to save the file
file_name = file_name + ".zip"
file_path = os.path.join(save_path, file_name)
# Save the content to the specified path
        with open(file_path, "wb") as file:
for chunk in response.iter_content(chunk_size=8192):
file.write(chunk)
print(f"File downloaded successfully to: {file_path}")
return file_path
except requests.exceptions.RequestException as e:
print(f"Error downloading file: {e}")
return None
# Download the boundary if not already cached, then read from the zip
if not os.path.exists(oak_bound_zip_path):
    print('downloading ' + oak_bound_url)
    download_zip_from_redirect(oak_bound_url, data_dir, "oakland_boundary.shp")
oak_bound_gdf = gpd.read_file(oak_bound_zip_path).to_crs(4326)
oak_bound_gdf.hvplot(aspect='equal')
downloading https://data.oaklandca.gov/api/geospatial/9bhq-yt6w?method=export&format=Shapefile
File downloaded successfully to: C:\Users\Pete\earth-analytics\data\oakland_boundary.shp.zip
# Download California census tracts (TIGER/Line) to select those around Oakland
ca_tract_path = os.path.join(
data_dir,
'earthpy-downloads',
'tl_2023_06_tract',
'tl_2023_06_tract.shp'
)
ca_census_url = ('https://www2.census.gov/geo/tiger/'
'TIGER2023/TRACT/tl_2023_06_tract.zip'
)
if not os.path.exists(ca_tract_path):
    print('downloading ' + ca_census_url)
    ca_tract_shp = et.data.get_data(url=ca_census_url)
else:
print("CA census tract data already exists")
ca_tract_gdf = gpd.read_file(ca_tract_path).to_crs(4326)
# Select tracts that fall entirely within the Oakland boundary
oak_tract_gdf = gpd.sjoin(ca_tract_gdf, oak_bound_gdf, predicate='within')
oak_tract_gdf.hvplot(aspect='equal') * oak_bound_gdf.hvplot(aspect='equal')
CA census tract data already exists
# 'within' drops tracts that cross the city boundary, so redo the join with 'intersects'
oak_tract_gdf = gpd.sjoin(ca_tract_gdf, oak_bound_gdf, predicate='intersects')
oak_tract_gdf.hvplot(aspect='equal') * oak_bound_gdf.hvplot(aspect='equal')
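The two joins above differ only in their spatial predicate: 'within' keeps just the tracts that fall entirely inside the city boundary, while 'intersects' also keeps tracts that cross it. A toy illustration with shapely boxes (geometries made up for the example):

from shapely.geometry import box

boundary = box(0, 0, 10, 10)
inside = box(2, 2, 4, 4)      # entirely inside the boundary
crossing = box(8, 8, 12, 12)  # straddles the boundary edge
print(inside.within(boundary), crossing.within(boundary))          # True False
print(inside.intersects(boundary), crossing.intersects(boundary))  # True True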
# Clip census tract boundaries to the San Francisco Bay edge
# Download geometry for San Francisco Bay
# (GeoPandas reads the zipped shapefile directly from the URL)
bay_url = ('https://spatial.lib.berkeley.edu/public/ark28722-s7d02x/data.zip')
bay_gdf = gpd.read_file(bay_url).to_crs(4326)
# Keep only the main bay polygon (OBJECTID 3 in this layer)
bay_gdf = bay_gdf[bay_gdf['OBJECTID'] == 3]
# Clip the tract polygons to the edge of the bay geometry
tract_clip = gpd.overlay(oak_tract_gdf, bay_gdf, how='difference')
tract_clip.hvplot(aspect='equal')
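The how='difference' overlay keeps the parts of the tract polygons that fall outside the bay polygon. A minimal sketch with toy squares (coordinates are illustrative):

import geopandas as gpd
from shapely.geometry import box

tracts_toy = gpd.GeoDataFrame(geometry=[box(0, 0, 4, 4)])
water_toy = gpd.GeoDataFrame(geometry=[box(2, 0, 6, 4)])
# The overlapping right half of the tract square is removed
land_only = gpd.overlay(tracts_toy, water_toy, how='difference')
print(land_only.geometry.iloc[0].bounds)  # (0.0, 0.0, 2.0, 4.0)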
# Download census data
# Set API key
api_key = getpass.getpass('US Census API Key')
c = Census(api_key)
census_fields = [
    'NAME',
    'B06011_001E'  # median income in the past 12 months
]
state_code = us.states.CA.fips
county_code = '001'  # Alameda County FIPS code
tract_table = c.acs5.state_county_tract(fields=census_fields,
                                        state_fips=state_code,
                                        county_fips=county_code,
                                        tract='*',
                                        year=2021)
tract_df = pd.DataFrame(tract_table)
tract_df
| | NAME | B06011_001E | state | county | tract |
|---|---|---|---|---|---|
| 0 | Census Tract 4001, Alameda County, California | 105875.0 | 06 | 001 | 400100 |
| 1 | Census Tract 4002, Alameda County, California | 98688.0 | 06 | 001 | 400200 |
| 2 | Census Tract 4003, Alameda County, California | 79947.0 | 06 | 001 | 400300 |
| 3 | Census Tract 4004, Alameda County, California | 82784.0 | 06 | 001 | 400400 |
| 4 | Census Tract 4005, Alameda County, California | 50156.0 | 06 | 001 | 400500 |
| ... | ... | ... | ... | ... | ... |
| 374 | Census Tract 9819, Alameda County, California | 173750.0 | 06 | 001 | 981900 |
| 375 | Census Tract 9820, Alameda County, California | 120893.0 | 06 | 001 | 982000 |
| 376 | Census Tract 9821, Alameda County, California | 4561.0 | 06 | 001 | 982100 |
| 377 | Census Tract 9832, Alameda County, California | 127273.0 | 06 | 001 | 983200 |
| 378 | Census Tract 9900, Alameda County, California | -666666666.0 | 06 | 001 | 990000 |
379 rows × 5 columns
# Join census data to tracts gdf
oak_tract_gdf = tract_clip.rename(columns={'TRACTCE': 'tract'})
oak_tract_census_gdf = oak_tract_gdf.merge(tract_df, on='tract')
oak_tract_census_gdf = oak_tract_census_gdf.rename(
    columns={'B06011_001E': 'Median_Income'})
# Drop tracts with the Census missing-data sentinel (-666666666)
oak_tract_census_gdf = oak_tract_census_gdf[oak_tract_census_gdf['Median_Income'] >= 0]
oak_tract_census_gdf
gv.tile_sources.OSM() * gv.Polygons(oak_tract_census_gdf)
# Open the Planetary Computer STAC catalog to search for NAIP imagery
pc_catalog = pystac_client.Client.open(
"https://planetarycomputer.microsoft.com/api/stac/v1"
)
pc_catalog.title
'Microsoft Planetary Computer STAC API'
# Accumulate URLs for NAIP per census tract
naip_url_dfs = []
for index, row in oak_tract_census_gdf.iterrows():
    try:
        row_geometry = row['geometry']
        naip_search = pc_catalog.search(
            collections=["naip"],
            intersects=shapely.to_geojson(row_geometry),
            datetime="2022"
        )
        for naip_item in naip_search.items():
            print("Found NAIP tile " + naip_item.id)
            naip_url_dfs.append(
                pd.DataFrame(dict(
                    tract=[row.tract],
                    tile_id=[naip_item.id],
                    url=[naip_item.assets['image'].href]
                ))
            )
    except Exception as e:
        print("Error getting NAIP info for tract "
              + row.tract + ": " + str(e))
        continue
# Merge results into a dataframe for analysis
naip_url_df = pd.concat(naip_url_dfs)
naip_url_df
# For each tract: clip NAIP tiles, compute NDVI, and find the percent above threshold
tract_ndvi_accumulator = []
for tract, naip_item_df in naip_url_df.groupby('tract'):
print(tract)
tract_ndvi_das = []
for i, naip_url_s in naip_item_df.iterrows():
# Open NAIP data array
full_naip_vda = rxr.open_rasterio(naip_url_s.url, masked=True).squeeze()
# Get census tract boundary
        boundary_gdf = oak_tract_census_gdf.to_crs(full_naip_vda.rio.crs)
        boundary_gdf = boundary_gdf[boundary_gdf.tract == tract]
# Clip NAIP data to tract boundary
try:
crop_naip_vda = full_naip_vda.rio.clip_box(
*boundary_gdf.total_bounds)
naip_vda = crop_naip_vda.rio.clip(boundary_gdf.geometry)
        except Exception:
print("error clipping NAIP for tract " + tract)
continue
# Compute NDVI
tract_ndvi_das.append(
(naip_vda.sel(band=4) - naip_vda.sel(band=1))
/ (naip_vda.sel(band=4) + naip_vda.sel(band=1))
)
# Merge rasters
if len(tract_ndvi_das)>1:
tract_ndvi_da = rxrm.merge_arrays(tract_ndvi_das)
else:
tract_ndvi_da = tract_ndvi_das[0]
# Calculate percent of cells with NDVI value above threshold
tract_total_pixels = tract_ndvi_da.notnull().sum()
    threshold_pixels = tract_ndvi_da > 0.12
threshold_pixels_sum = threshold_pixels.sum()
threshold_pct = (
threshold_pixels_sum / tract_total_pixels
* 100
)
print(threshold_pct)
tract_ndvi_accumulator.append(
pd.DataFrame(dict(
tract=[tract],
naip_pct=[threshold_pct.item()]
))
)
tract_ndvi_df = pd.concat(tract_ndvi_accumulator)
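In the percent-above-threshold step, notnull() excludes masked (NaN) pixels from the denominator, and a NaN compared against the threshold evaluates False, so masked pixels are never counted as vegetated. A small worked example with toy values:

import numpy as np
import xarray as xr

toy_ndvi = xr.DataArray([[0.5, 0.05], [np.nan, 0.3]])
total = toy_ndvi.notnull().sum()  # 3 valid pixels
above = (toy_ndvi > 0.12).sum()   # 2 above threshold (NaN > 0.12 is False)
print((above / total * 100).item())  # 66.66...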
# Save NDVI analysis output to CSV file
csv_path = os.path.join(data_dir, 'oakland_ndvi_by_tract.csv')
if not os.path.exists(csv_path):
    print('copying results to csv ')
    tract_ndvi_df.to_csv(csv_path, index=False)
else:
print("NDVI results csv already exists.")
copying results to csv
# Load in CSV results to dataframe
ndvi_df = pd.read_csv(csv_path)
# Remove all rows of NDVI results with values below zero
ndvi_df = ndvi_df[ndvi_df['naip_pct'] >= 0]
# Cast tract to int64 to match the dtype read back from the CSV
oak_tract_census_gdf['tract'] = oak_tract_census_gdf['tract'].astype('int64')
oak_tract_final = pd.merge(oak_tract_census_gdf, ndvi_df, on='tract')
oak_tract_final
# QA/QC on values for analysis
income_min = oak_tract_final['Median_Income'].min()
income_max = oak_tract_final['Median_Income'].max()
print("Range of values in the 'median_income' column:")
print("Min:", income_min)
print("Max:", income_max)
ndvi_min = oak_tract_final['naip_pct'].min()
ndvi_max = oak_tract_final['naip_pct'].max()
print("Range of values in the NDVI pct column:")
print("Min:", ndvi_min)
print("Max:", ndvi_max)
Range of values in the 'Median_Income' column:
Min: 4561.0
Max: 173750.0
Range of values in the 'naip_pct' column:
Min: 1.2103743075253135
Max: 58.35590104804
# Create choropleth plots to confirm results
med_income_plot = oak_tract_final.hvplot(c='Median_Income',
cmap='Viridis',
geo=True,
tiles='CartoLight',
width=400,
height=400,
title='Choropleth Plot - Median Income')
ndvi_plot = oak_tract_final.hvplot(c='naip_pct',
cmap='Viridis',
geo=True,
tiles='CartoLight',
width=400,
height=400,
title='Choropleth Plot - NDVI % above Threshold')
combined_plot = med_income_plot + ndvi_plot
combined_plot
# Transform median income and NDVI percent values to logs
oak_tract_final['log_median_income'] = np.log(oak_tract_final['Median_Income'])
oak_tract_final = oak_tract_final.set_index('tract')
oak_tract_final = oak_tract_final.rename(columns={'naip_pct': 'ndvi_pct'})
oak_tract_final['log_ndvi_pct'] = np.log(oak_tract_final['ndvi_pct'])
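The log transform compresses the long right tail of the income distribution so the fit is not dominated by the wealthiest tracts; using the min and max from the QA step above:

import numpy as np

# The observed income range, compressed from a ~38x spread to ~1.4x in log space
print(np.log(4561.0), np.log(173750.0))  # ~8.43, ~12.07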
# Display a scatter matrix of my variables.
hvplot.scatter_matrix(
oak_tract_final[['log_median_income', 'log_ndvi_pct']]
)
# Set up test and train data for the OLS model
X = oak_tract_final[['log_median_income']]
y = oak_tract_final[['log_ndvi_pct']]
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
X, y, random_state=42)
X_train, X_test, y_train, y_test
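By default, train_test_split holds out 25% of the rows for testing; a quick illustration on toy data:

import numpy as np
from sklearn.model_selection import train_test_split

toy = np.arange(20).reshape(10, 2)
train, test = train_test_split(toy, random_state=42)
print(train.shape, test.shape)  # (7, 2) (3, 2)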
# Run linear regression model
linear_reg = linear_model.LinearRegression().fit(X_train, y_train)
# Back-transform test values and predictions from log space
y_measured = np.exp(y_test)
y_hat = np.exp(linear_reg.predict(X_test))
# Process results
test_df = y_test.copy()
test_df['predicted_ndvi_pct'] = y_hat
test_df['measured_ndvi_pct'] = y_measured
test_df
# Plot results
y_max = float(test_df.measured_ndvi_pct.max())
(
test_df
.hvplot.scatter(x='measured_ndvi_pct', y='predicted_ndvi_pct')
.opts(aspect='equal', xlim=(0, y_max), ylim=(0, y_max), width=600, height=600)
) * hv.Slope(slope=1, y_intercept=0).opts(color='black')
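For a quantitative check of the fit alongside the plot, one option is scikit-learn's r2_score on the held-out test set, evaluated in the log space where the model was fit:

from sklearn.metrics import r2_score

# R^2 on the test split, plus the fitted slope and intercept
print('R^2 (log space):', r2_score(y_test, linear_reg.predict(X_test)))
print('slope:', linear_reg.coef_, 'intercept:', linear_reg.intercept_)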
# Calculate the error between measured NDVI % and predicted
oak_tract_final['predicted_ndvi_pct'] = np.exp(linear_reg.predict(X))
oak_tract_final['error_ndvi_pct'] = (
oak_tract_final['predicted_ndvi_pct']
- oak_tract_final['ndvi_pct']
)
# Plot the error in a choropleth map
results_plot = oak_tract_final.hvplot(c='error_ndvi_pct',
                                      cmap='RdBu',
                                      geo=True,
                                      tiles='CartoLight',
                                      width=600,
                                      height=600,
                                      title='Choropleth Plot - NDVI % Prediction Error')
results_plot
Analysis Summary
We looked for a relationship between median income and the percentage of pixels per tract with NDVI above 0.12, expecting that as median income increased, the share of vegetated pixels would increase as well. However, there is a large discrepancy between predicted and measured values in the eastern areas of Oakland. This area, known as the Oakland Hills, is generally very affluent and also heavily vegetated. It appears that the range of incomes in these affluent sections may not be well represented by the census tract aggregation, hence the large error in the NDVI prediction. Tracts in the central portion of Oakland show little error between predicted and measured NDVI percent, suggesting that the model does a decent job for the more urban parts of the city but fails in the eastern Oakland Hills.